In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [2128]:
# Dropdown used to pick which pre-processed stock dataset to load below.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    # Ignore every event except an actual change of the `value` trait.
    if change['type'] != 'change' or change['name'] != 'value':
        return
    print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected TSLA
In [2129]:
# Load the pre-processed CSV matching the dropdown selection.
# All fifteen branches of the original if-chain followed the same
# '/content/Final_<TICKER>.csv' pattern, so a membership test plus an
# f-string is equivalent. As before, when 'SELECT' (or any unknown value)
# is chosen, `df` is left unassigned and downstream cells will raise.
TICKERS = ['AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
           'FB', 'GME', 'MCD', 'PFE', 'PLUG',
           'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']

if w.value in TICKERS:
    df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [2130]:
pd.set_option('display.max_colwidth', None)
In [2131]:
# Parse the Date column (read from CSV as plain strings) into datetime64[ns].
df['Date'] = df['Date'].astype("datetime64[ns]")
In [2132]:
# Drop the leftover positional-index column ('Unnamed: 0') that a previous
# to_csv(index=True) export wrote into the file.
del df['Unnamed: 0']
In [2133]:
df.head(5)
Out[2133]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-12-10 574.369995 627.750000 566.340027 627.070007 627.070007 67083200 3.737101 0.799498 711.908644 40.664306 665.424660 558.698195 612.061428 NaN 6.484811 61.409973 5.161220 NaN NaN NaN 53.070007 NaN 0.092456 73.926700 NaN NaN 66.142314 77.814495 2.969433e+08 7.310341e+07 458017600.0 0.0 5.254501e+07 0.0 0.0 0.0 0.0 0.0 5.254501e+07 0.0 5.254501e+07 0.0 5.254501e+07 68 5130 5198 0 5198 0 5198 5198
1 2020-12-11 615.010010 624.000000 596.799988 609.989990 609.989990 46475000 -2.723782 0.598537 410.813247 39.179408 658.479911 577.405797 617.942854 NaN 6.422959 30.270020 5.161220 NaN NaN NaN 24.229980 NaN 0.041365 69.445194 NaN NaN 49.770800 63.400510 2.955422e+08 7.050876e+07 411542600.0 0.0 3.595884e+07 0.0 0.0 0.0 0.0 0.0 3.595884e+07 0.0 3.595884e+07 0.0 3.595884e+07 52 3361 3413 0 3413 0 3413 3413
2 2020-12-14 619.000000 642.750000 610.200012 639.830017 639.830017 52040600 4.891888 0.420026 349.025431 38.262351 661.943014 587.214125 624.578570 NaN 5.980081 32.760010 19.527795 NaN NaN NaN 72.230042 NaN 0.127255 72.573367 NaN NaN 67.390306 61.101140 3.382459e+08 7.662497e+07 463583200.0 0.0 4.056763e+07 0.0 0.0 0.0 0.0 0.0 4.056763e+07 0.0 4.056763e+07 0.0 4.056763e+07 51 3452 3503 0 3503 0 3503 3503
3 2020-12-15 643.280029 646.900024 623.799988 633.250000 633.250000 45071500 -1.028401 0.413194 242.708955 36.096306 660.623954 598.307477 629.465716 NaN 5.700167 23.100037 22.557114 NaN NaN NaN 48.489990 NaN 0.082923 70.850830 NaN NaN 69.731745 62.297617 3.300511e+08 6.955370e+07 418511700.0 0.0 1.085622e+08 0.0 0.0 0.0 0.0 0.0 1.085622e+08 0.0 1.085622e+08 0.0 1.085622e+08 44 3494 3538 0 3538 0 3538 3538
4 2020-12-16 628.229980 632.500000 605.000000 622.770020 622.770020 42095800 -1.654952 0.830782 220.161180 34.975405 656.428519 597.077201 626.752860 70.255452 5.616103 28.250000 2.220025 NaN NaN NaN 53.950012 12.906767 0.094845 68.079269 NaN NaN 76.542960 71.221670 3.423584e+08 6.425382e+07 376415900.0 0.0 5.524962e+07 0.0 0.0 0.0 0.0 0.0 5.524962e+07 0.0 5.524962e+07 0.0 5.524962e+07 28 3420 3448 0 3448 0 3448 3448
In [2134]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       259 non-null    datetime64[ns]
 1   Open                       259 non-null    float64       
 2   High                       259 non-null    float64       
 3   Low                        259 non-null    float64       
 4   Close                      259 non-null    float64       
 5   Adj Close                  259 non-null    float64       
 6   Volume                     259 non-null    int64         
 7   Return                     259 non-null    float64       
 8   Beta                       259 non-null    float64       
 9   Variance                   259 non-null    float64       
 10  AvgTrueRange               259 non-null    float64       
 11  Upperband                  259 non-null    float64       
 12  Lowerband                  259 non-null    float64       
 13  Middleband                 259 non-null    float64       
 14  APO                        255 non-null    float64       
 15  NATR                       259 non-null    float64       
 16  TRANGE                     259 non-null    float64       
 17  DMI                        259 non-null    float64       
 18  MACD                       247 non-null    float64       
 19  MACDSIGNAL                 247 non-null    float64       
 20  MACDHIST                   247 non-null    float64       
 21  MOM                        259 non-null    float64       
 22  PPO                        255 non-null    float64       
 23  ROCP                       259 non-null    float64       
 24  RSI                        259 non-null    float64       
 25  TRIX                       192 non-null    float64       
 26  ULTOSC                     252 non-null    float64       
 27  SLOWK                      259 non-null    float64       
 28  SLOWD                      259 non-null    float64       
 29  AD                         259 non-null    float64       
 30  ADOSC                      259 non-null    float64       
 31  OBV                        259 non-null    float64       
 32  Upward_momentum_created    259 non-null    float64       
 33  Downward_momentum_created  259 non-null    float64       
 34  B5_O_Um                    259 non-null    float64       
 35  B5_C_Um                    259 non-null    float64       
 36  B5_E_Um                    259 non-null    float64       
 37  B5_A_Um                    259 non-null    float64       
 38  B5_N_Um                    259 non-null    float64       
 39  B5_O_Dm                    259 non-null    float64       
 40  B5_C_Dm                    259 non-null    float64       
 41  B5_E_Dm                    259 non-null    float64       
 42  B5_A_Dm                    259 non-null    float64       
 43  B5_N_Dm                    259 non-null    float64       
 44  Verified_status_True       259 non-null    int64         
 45  Verified_status_False      259 non-null    int64         
 46  O                          259 non-null    int64         
 47  C                          259 non-null    int64         
 48  E                          259 non-null    int64         
 49  A                          259 non-null    int64         
 50  N                          259 non-null    int64         
 51  Real_or_Fake_tweet         259 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 105.3 KB
In [2135]:
df.shape
Out[2135]:
(259, 52)
In [2136]:
sns.set(font_scale=0.8)
In [2137]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# PLOT OUT BTC-USE'S CLOSING PRICES SINCE 2014
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')    
In [2138]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
# The original `.dropna()` here was a no-op: assigning the series back into
# `df` realigns on the index, so the first row becomes NaN again regardless
# (visible as NaN in row 0 of the head() output below). Removed the dead call.
df['returns'] = 100 * df.Close.pct_change()
In [2139]:
# CALCULATE LOG RETURNS BASED ON ABOVE FORMULA
# r_t = ln(Close_t / Close_{t-1}); the first row is NaN (no prior close).
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [2140]:
df.head()
Out[2140]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-12-10 574.369995 627.750000 566.340027 627.070007 627.070007 67083200 3.737101 0.799498 711.908644 40.664306 665.424660 558.698195 612.061428 NaN 6.484811 61.409973 5.161220 NaN NaN NaN 53.070007 NaN 0.092456 73.926700 NaN NaN 66.142314 77.814495 2.969433e+08 7.310341e+07 458017600.0 0.0 5.254501e+07 0.0 0.0 0.0 0.0 0.0 5.254501e+07 0.0 5.254501e+07 0.0 5.254501e+07 68 5130 5198 0 5198 0 5198 5198 NaN NaN
1 2020-12-11 615.010010 624.000000 596.799988 609.989990 609.989990 46475000 -2.723782 0.598537 410.813247 39.179408 658.479911 577.405797 617.942854 NaN 6.422959 30.270020 5.161220 NaN NaN NaN 24.229980 NaN 0.041365 69.445194 NaN NaN 49.770800 63.400510 2.955422e+08 7.050876e+07 411542600.0 0.0 3.595884e+07 0.0 0.0 0.0 0.0 0.0 3.595884e+07 0.0 3.595884e+07 0.0 3.595884e+07 52 3361 3413 0 3413 0 3413 3413 -2.723782 -0.027616
2 2020-12-14 619.000000 642.750000 610.200012 639.830017 639.830017 52040600 4.891888 0.420026 349.025431 38.262351 661.943014 587.214125 624.578570 NaN 5.980081 32.760010 19.527795 NaN NaN NaN 72.230042 NaN 0.127255 72.573367 NaN NaN 67.390306 61.101140 3.382459e+08 7.662497e+07 463583200.0 0.0 4.056763e+07 0.0 0.0 0.0 0.0 0.0 4.056763e+07 0.0 4.056763e+07 0.0 4.056763e+07 51 3452 3503 0 3503 0 3503 3503 4.891888 0.047760
3 2020-12-15 643.280029 646.900024 623.799988 633.250000 633.250000 45071500 -1.028401 0.413194 242.708955 36.096306 660.623954 598.307477 629.465716 NaN 5.700167 23.100037 22.557114 NaN NaN NaN 48.489990 NaN 0.082923 70.850830 NaN NaN 69.731745 62.297617 3.300511e+08 6.955370e+07 418511700.0 0.0 1.085622e+08 0.0 0.0 0.0 0.0 0.0 1.085622e+08 0.0 1.085622e+08 0.0 1.085622e+08 44 3494 3538 0 3538 0 3538 3538 -1.028401 -0.010337
4 2020-12-16 628.229980 632.500000 605.000000 622.770020 622.770020 42095800 -1.654952 0.830782 220.161180 34.975405 656.428519 597.077201 626.752860 70.255452 5.616103 28.250000 2.220025 NaN NaN NaN 53.950012 12.906767 0.094845 68.079269 NaN NaN 76.542960 71.221670 3.423584e+08 6.425382e+07 376415900.0 0.0 5.524962e+07 0.0 0.0 0.0 0.0 0.0 5.524962e+07 0.0 5.524962e+07 0.0 5.524962e+07 28 3420 3448 0 3448 0 3448 3448 -1.654952 -0.016688
In [2141]:
# DROPPING THE 1ST ROW OF DATA 
# BECAUSE I SHIFTED IT FORWARD TO CALCULATE RETURNS/LOG RETURNS
df.dropna(inplace=True)
In [2142]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    # One row of panels per series: (values, line colour, panel title).
    panels = [(df.returns, 'blue', 'Returns'),
              (df.log_returns, 'green', 'Log Returns')]

    for row, (series, colour, title) in enumerate(panels):
        # Left panel: the raw series over time.
        axes[row][0].plot(series, color=colour)
        axes[row][0].set_title(title)

        # Right panel: histogram with a fitted normal overlay.
        sns.distplot(series, norm_hist=True, fit=stats.norm, color=colour,
                     bins=50, ax=axes[row][1])
        axes[row][1].set_title(title)

    plt.tight_layout()
    fig.show();
In [2143]:
# CREATE A FUNCTION THAT CALCULATES REALIZED VOLATILITY
# FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Realized volatility over a window of daily log returns.

    Calculated as the square root of the sum of squared log returns
    divided by (n - 1) within a specific window interval.

    Parameters
    ----------
    series_log_return : array-like of float
        Daily log returns within the window.

    Returns
    -------
    float
        The realized volatility, or NaN when fewer than two observations
        are supplied (the (n - 1) denominator would otherwise be zero).
    """
    n = len(series_log_return)
    if n < 2:
        # Guard against division by zero on degenerate windows.
        return np.nan
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [2144]:
intervals = [7, 30, 60, 180, 365]

# Build one realized-volatility column per window length in a single pass,
# then assemble them all into a DataFrame aligned with df's index.
vols_df = pd.DataFrame(
    {win: df.log_returns.rolling(window=win)
                        .apply(realized_volatility_daily)
                        .values
     for win in intervals},
    columns=intervals,
    index=df.index,
)
In [2145]:
# CHANGING MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for win in intervals:
    # De-emphasise the noisy 7-day series; draw the rest at full weight.
    emphasised = win != 7
    ax.plot(vols_df[win],
            label=f'{win}-Day Interval Realized Volatility',
            alpha=1.0 if emphasised else 0.5,
            lw=2 if emphasised else 1)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [2146]:
INTERVAL_WINDOW = 30  # observations per realized-volatility window
n_future = 7          # how many rows ahead the "future" window is shifted

# GET BACKWARD LOOKING REALIZED VOLATILITY
# (volatility over the most recent 30 rows — usable as a model feature)
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY 
# shift(-n_future) pulls future rows back, so each value summarises returns
# ahead of its row. NOTE(review): this column looks into the future — it is
# presumably the prediction target; confirm it is never used as an input.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [2147]:
df.describe()
Out[2147]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 192.000000 192.000000 192.000000 192.000000 192.000000 1.920000e+02 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 1.920000e+02 1.920000e+02 1.920000e+02 192.0 1.920000e+02 192.0 192.0 192.0 192.0 192.0 1.920000e+02 192.0 1.920000e+02 192.0 1.920000e+02 192.000000 192.000000 192.000000 192.0 192.000000 192.0 192.000000 192.000000 192.000000 192.000000 163.000000 156.000000
mean 770.562917 785.527549 754.949428 770.809322 770.809322 2.538587e+07 0.209910 0.766553 816.205489 32.716764 811.985427 721.842800 766.914113 13.673222 4.163695 32.844266 34.120150 11.499694 11.144082 0.355612 16.490469 1.597427 0.025935 55.885895 0.128767 52.965146 56.417876 56.597772 8.179319e+08 5.074038e+06 8.279505e+08 0.0 4.744614e+07 0.0 0.0 0.0 0.0 0.0 4.744614e+07 0.0 4.744614e+07 0.0 4.744614e+07 63.114583 4674.723958 4737.838542 0.0 4737.838542 0.0 4737.838542 4737.838542 0.209910 0.001669 0.027516 0.027280
std 170.456827 176.242155 163.122761 170.162721 170.162721 9.739109e+06 2.935941 0.357165 1408.852330 15.261383 193.398705 143.975228 166.811590 40.600020 1.334752 22.653615 25.457655 28.835383 27.082933 9.845492 89.727964 4.729407 0.107606 12.810017 0.327469 7.965855 23.807437 21.941664 9.839856e+07 1.330141e+07 2.325741e+08 0.0 3.400392e+07 0.0 0.0 0.0 0.0 0.0 3.400392e+07 0.0 3.400392e+07 0.0 3.400392e+07 39.247068 2210.142871 2245.294971 0.0 2245.294971 0.0 2245.294971 2245.294971 2.935941 0.029339 0.009119 0.009251
min 552.549988 566.210022 546.979980 563.460022 563.460022 9.800600e+06 -11.990296 -0.732332 8.112051 15.857966 594.664076 540.736287 578.178580 -59.704753 2.089077 9.169983 0.674235 -34.004668 -28.798312 -23.035439 -195.199951 -9.022349 -0.168725 30.137886 -0.272391 32.322336 7.978260 11.571510 6.785000e+08 -3.055088e+07 5.295056e+08 0.0 1.414173e+07 0.0 0.0 0.0 0.0 0.0 1.414173e+07 0.0 1.414173e+07 0.0 1.414173e+07 16.000000 2202.000000 2226.000000 0.0 2226.000000 0.0 2226.000000 2226.000000 -11.990296 -0.127723 0.014402 0.014402
25% 659.304993 666.410004 643.999985 656.250000 656.250000 1.850182e+07 -1.169610 0.527252 118.981362 21.287808 690.727727 626.965461 658.047141 -6.908829 3.064312 17.475006 15.442360 -6.910688 -6.338453 -3.397226 -28.335022 -0.787765 -0.040403 47.526313 -0.134923 46.635129 35.352979 38.382481 7.479462e+08 -3.536103e+06 6.383761e+08 0.0 2.757560e+07 0.0 0.0 0.0 0.0 0.0 2.757560e+07 0.0 2.757560e+07 0.0 2.757560e+07 40.000000 3179.750000 3222.250000 0.0 3222.250000 0.0 3222.250000 3222.250000 -1.169610 -0.011765 0.020371 0.020172
50% 710.839996 719.990021 701.989990 710.454987 710.454987 2.353565e+07 0.190414 0.737615 314.898087 25.122889 738.128392 670.284198 706.804278 10.612884 3.836345 27.290009 27.235595 11.245599 9.808429 1.213997 18.450012 1.573802 0.025185 54.030052 0.036283 52.901369 56.987522 57.054820 7.742272e+08 4.512362e+06 7.366648e+08 0.0 3.532263e+07 0.0 0.0 0.0 0.0 0.0 3.532263e+07 0.0 3.532263e+07 0.0 3.532263e+07 51.000000 4137.000000 4190.000000 0.0 4190.000000 0.0 4190.000000 4190.000000 0.190414 0.001902 0.024996 0.024343
75% 803.314987 813.092499 798.872513 807.059982 807.059982 3.054300e+07 1.840990 0.976373 729.995052 39.446491 807.736348 771.707313 789.859643 23.599107 5.139345 39.997482 48.490930 16.658031 16.146437 4.672527 52.612503 3.281958 0.079176 62.931757 0.267894 58.316329 78.418493 75.675742 8.586307e+08 1.230055e+07 9.989748e+08 0.0 5.338609e+07 0.0 0.0 0.0 0.0 0.0 5.338609e+07 0.0 5.338609e+07 0.0 5.338609e+07 74.000000 5444.000000 5513.500000 0.0 5513.500000 0.0 5513.500000 5513.500000 1.840990 0.018242 0.033108 0.032926
max 1234.410034 1243.489990 1217.000000 1229.910034 1229.910034 6.285210e+07 12.661595 1.893748 7315.693248 73.086370 1309.155080 1112.101288 1189.055699 159.212943 7.314463 162.979980 98.114812 114.203788 93.856278 30.888519 348.059998 16.619693 0.402010 94.198009 0.822076 77.940114 96.653419 93.889323 1.043720e+09 4.700579e+07 1.286465e+09 0.0 2.246558e+08 0.0 0.0 0.0 0.0 0.0 2.246558e+08 0.0 2.246558e+08 0.0 2.246558e+08 348.000000 21393.000000 21741.000000 0.0 21741.000000 0.0 21741.000000 21741.000000 12.661595 0.119218 0.047094 0.047094
In [2148]:
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [2149]:
# Fill remaining NaNs (leading indicator warm-up rows) with column medians.
# NOTE(review): presumably df.median() skips the datetime Date column and
# leaves it untouched — confirm for the pandas version in use.
df = df.fillna(df.median())
In [2150]:
df.isna().sum()
Out[2150]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2151]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 67 to 258
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       192 non-null    datetime64[ns]
 1   Open                       192 non-null    float64       
 2   High                       192 non-null    float64       
 3   Low                        192 non-null    float64       
 4   Close                      192 non-null    float64       
 5   Adj Close                  192 non-null    float64       
 6   Volume                     192 non-null    int64         
 7   Return                     192 non-null    float64       
 8   Beta                       192 non-null    float64       
 9   Variance                   192 non-null    float64       
 10  AvgTrueRange               192 non-null    float64       
 11  Upperband                  192 non-null    float64       
 12  Lowerband                  192 non-null    float64       
 13  Middleband                 192 non-null    float64       
 14  APO                        192 non-null    float64       
 15  NATR                       192 non-null    float64       
 16  TRANGE                     192 non-null    float64       
 17  DMI                        192 non-null    float64       
 18  MACD                       192 non-null    float64       
 19  MACDSIGNAL                 192 non-null    float64       
 20  MACDHIST                   192 non-null    float64       
 21  MOM                        192 non-null    float64       
 22  PPO                        192 non-null    float64       
 23  ROCP                       192 non-null    float64       
 24  RSI                        192 non-null    float64       
 25  TRIX                       192 non-null    float64       
 26  ULTOSC                     192 non-null    float64       
 27  SLOWK                      192 non-null    float64       
 28  SLOWD                      192 non-null    float64       
 29  AD                         192 non-null    float64       
 30  ADOSC                      192 non-null    float64       
 31  OBV                        192 non-null    float64       
 32  Upward_momentum_created    192 non-null    float64       
 33  Downward_momentum_created  192 non-null    float64       
 34  B5_O_Um                    192 non-null    float64       
 35  B5_C_Um                    192 non-null    float64       
 36  B5_E_Um                    192 non-null    float64       
 37  B5_A_Um                    192 non-null    float64       
 38  B5_N_Um                    192 non-null    float64       
 39  B5_O_Dm                    192 non-null    float64       
 40  B5_C_Dm                    192 non-null    float64       
 41  B5_E_Dm                    192 non-null    float64       
 42  B5_A_Dm                    192 non-null    float64       
 43  B5_N_Dm                    192 non-null    float64       
 44  Verified_status_True       192 non-null    int64         
 45  Verified_status_False      192 non-null    int64         
 46  O                          192 non-null    int64         
 47  C                          192 non-null    int64         
 48  E                          192 non-null    int64         
 49  A                          192 non-null    int64         
 50  N                          192 non-null    int64         
 51  Fake_news                  192 non-null    int64         
 52  returns                    192 non-null    float64       
 53  log_returns                192 non-null    float64       
 54  vol_current                192 non-null    float64       
 55  vol_future                 192 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 85.5 KB
In [2152]:
df.shape
Out[2152]:
(192, 56)
In [2153]:
# NOTE(review): likely a no-op — the fillna(median) above already removed all
# NaNs (the isna().sum() output shows zeros); kept as a defensive safeguard.
df=df.dropna()
In [2154]:
df.dtypes
Out[2154]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2155]:
# matplotlib and seaborn are already imported in the notebook's import cell;
# the duplicate imports that were scattered here have been removed so all
# dependencies live in one place.
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(), annot=True)
Out[2155]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f076c08e0d0>
In [2156]:
# Histogram of every numeric column (70 bins each) for a quick distribution scan.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [2157]:
# Features whose absolute correlation with AvgTrueRange exceeds 0.5, strongest first.
corr_with_target = df.corr()['AvgTrueRange']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 18 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
vol_current     0.861448
NATR            0.838765
Upperband       0.828552
Middleband      0.805369
vol_future      0.791198
AD              0.773876
High            0.765297
Open            0.759796
Lowerband       0.753248
Adj Close       0.743594
Close           0.743594
TRANGE          0.737877
Low             0.730372
TRIX            0.710798
OBV             0.690430
Variance        0.637019
MACDSIGNAL      0.530534
Name: AvgTrueRange, dtype: float64
In [2158]:
# Features whose absolute correlation with NATR exceeds 0.5, strongest first.
corr_with_target = df.corr()['NATR']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 8 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.838765
vol_current     0.774895
vol_future      0.605647
TRANGE          0.570414
Volume          0.536369
SLOWK          -0.511288
SLOWD          -0.515332
Name: NATR, dtype: float64
In [2159]:
# Features whose absolute correlation with TRANGE exceeds 0.5, strongest first.
corr_with_target = df.corr()['TRANGE']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 24 strongly correlated values with TRANGE:
TRANGE                   1.000000
AvgTrueRange             0.737877
Volume                   0.684353
Upperband                0.672372
High                     0.658233
Verified_status_False    0.649044
Fake_news                0.647614
N                        0.647614
E                        0.647614
O                        0.647614
Middleband               0.642286
Open                     0.632049
vol_future               0.617318
Adj Close                0.612981
Close                    0.612981
AD                       0.602951
vol_current              0.601244
Lowerband                0.585139
Low                      0.583694
Variance                 0.578932
NATR                     0.570414
OBV                      0.566621
TRIX                     0.549118
MACDSIGNAL               0.505256
Name: TRANGE, dtype: float64
In [2160]:
# Features whose absolute correlation with O (Openness) exceeds 0.5, strongest first.
corr_with_target = df.corr()['O']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999969
Verified_status_True         0.897390
B5_N_Dm                      0.681087
B5_E_Dm                      0.681087
B5_O_Dm                      0.681087
Downward_momentum_created    0.681087
Volume                       0.676733
TRANGE                       0.647614
Variance                     0.527478
DMI                          0.503846
Name: O, dtype: float64
In [2161]:
# Features whose absolute correlation with C (Conscientiousness) exceeds 0.5.
# Label capitalized for consistency with the "Openness" cell above.
df_corr = df.corr()['C']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [2162]:
# Features whose absolute correlation with E (Extraversion) exceeds 0.5.
# Fixed copy-paste bug: the printed label said "conscientiousness" for the E column.
df_corr = df.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999969
Verified_status_True         0.897390
B5_N_Dm                      0.681087
B5_E_Dm                      0.681087
B5_O_Dm                      0.681087
Downward_momentum_created    0.681087
Volume                       0.676733
TRANGE                       0.647614
Variance                     0.527478
DMI                          0.503846
Name: E, dtype: float64
In [2163]:
# Features whose absolute correlation with A (Agreeableness) exceeds 0.5.
# Fixed copy-paste bug: the printed label said "conscientiousness" for the A column.
df_corr = df.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [2164]:
# Features whose absolute correlation with N (Neuroticism) exceeds 0.5.
# Fixed copy-paste bug: the printed label said "conscientiousness" for the N column.
df_corr = df.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999969
Verified_status_True         0.897390
B5_N_Dm                      0.681087
B5_E_Dm                      0.681087
B5_O_Dm                      0.681087
Downward_momentum_created    0.681087
Volume                       0.676733
TRANGE                       0.647614
Variance                     0.527478
DMI                          0.503846
Name: N, dtype: float64
In [2165]:
# List all available feature columns.
df.columns
Out[2165]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [2166]:
# Features whose absolute correlation with B5_O_Um exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_O_Um']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2167]:
# Features whose absolute correlation with B5_C_Um exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_C_Um']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2168]:
# Features whose absolute correlation with B5_E_Um exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_E_Um']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2169]:
# Features whose absolute correlation with B5_A_Um exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_A_Um']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2170]:
# Features whose absolute correlation with B5_N_Um exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_N_Um']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlations

In [2171]:
# Features whose absolute correlation with B5_O_Dm exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_O_Dm']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.767947
Fake_news                    0.681087
N                            0.681087
E                            0.681087
O                            0.681087
Verified_status_False        0.678283
Name: B5_O_Dm, dtype: float64
In [2172]:
# Features whose absolute correlation with B5_C_Dm exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_C_Dm']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [2173]:
# Features whose absolute correlation with B5_E_Dm exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_E_Dm']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.767947
Fake_news                    0.681087
N                            0.681087
E                            0.681087
O                            0.681087
Verified_status_False        0.678283
Name: B5_E_Dm, dtype: float64
In [2174]:
# Features whose absolute correlation with B5_A_Dm exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_A_Dm']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [2175]:
# Features whose absolute correlation with B5_N_Dm exceeds 0.5, strongest first.
corr_with_target = df.corr()['B5_N_Dm']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.767947
Fake_news                    0.681087
N                            0.681087
E                            0.681087
O                            0.681087
Verified_status_False        0.678283
Name: B5_N_Dm, dtype: float64
In [2176]:
# Features whose absolute correlation with Fake_news exceeds 0.5, strongest first.
corr_with_target = df.corr()['Fake_news']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999969
Verified_status_True         0.897390
B5_N_Dm                      0.681087
B5_E_Dm                      0.681087
B5_O_Dm                      0.681087
Downward_momentum_created    0.681087
Volume                       0.676733
TRANGE                       0.647614
Variance                     0.527478
DMI                          0.503846
Name: Fake_news, dtype: float64
In [2177]:
# Features whose absolute correlation with Downward_momentum_created exceeds 0.5.
corr_with_target = df.corr()['Downward_momentum_created']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.767947
Fake_news                    0.681087
N                            0.681087
E                            0.681087
O                            0.681087
Verified_status_False        0.678283
Name: Downward_momentum_created, dtype: float64
In [2178]:
# Features whose absolute correlation with Upward_momentum_created exceeds 0.5.
corr_with_target = df.corr()['Upward_momentum_created']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2179]:
# Features whose absolute correlation with Verified_status_True exceeds 0.5.
corr_with_target = df.corr()['Verified_status_True']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 11 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.897390
N                            0.897390
E                            0.897390
O                            0.897390
Verified_status_False        0.893906
B5_N_Dm                      0.767947
B5_E_Dm                      0.767947
B5_O_Dm                      0.767947
Downward_momentum_created    0.767947
Volume                       0.549621
Name: Verified_status_True, dtype: float64
In [2180]:
# Features whose absolute correlation with Verified_status_False exceeds 0.5.
corr_with_target = df.corr()['Verified_status_False']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999969
N                            0.999969
E                            0.999969
O                            0.999969
Verified_status_True         0.893906
B5_N_Dm                      0.678283
B5_E_Dm                      0.678283
B5_O_Dm                      0.678283
Downward_momentum_created    0.678283
Volume                       0.677736
TRANGE                       0.649044
Variance                     0.528488
DMI                          0.504645
Name: Verified_status_False, dtype: float64
In [2181]:
# Shrink seaborn fonts so the dense pairplots below stay legible.
sns.set(font_scale=0.8)
In [2182]:
# Scatter every feature against NATR, five features per pairplot row.
chunk = 5
for start in range(0, len(df.columns), chunk):
    sns.pairplot(data=df,
                 x_vars=df.columns[start:start + chunk],
                 y_vars=['NATR'])
In [2183]:
# Confirm column dtypes (Date parsed as datetime64, indicators numeric).
df.dtypes
Out[2183]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2184]:
# Per-column missing-value counts (output shows all zero after earlier cleaning).
df.isnull().sum()
Out[2184]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2185]:
# Replace any remaining NaNs with 0.  Plain assignment is preferred over
# inplace=True (no performance benefit, and it hides the mutation on re-runs).
df = df.fillna(0)
In [2186]:
# NOTE(review): after fillna(0) in the previous cell there are no NaNs left,
# so this drop is a no-op kept as a safety net.  Assignment form avoids
# hidden in-place mutation.
df = df.dropna()
In [2187]:
# Keep seaborn fonts small for the filtered heatmap below.
sns.set(font_scale=0.8)
In [2188]:
# Heatmap of only the strong correlations, with Close excluded.
# NOTE(review): the thresholds are asymmetric (>= 0.5 vs <= -0.4) — confirm this
# is intentional rather than a typo for -0.5.
corr = df.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [2189]:
# Summary statistics for every numeric column.
df.describe()
Out[2189]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 192.000000 192.000000 192.000000 192.000000 192.000000 1.920000e+02 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000 1.920000e+02 1.920000e+02 1.920000e+02 192.0 1.920000e+02 192.0 192.0 192.0 192.0 192.0 1.920000e+02 192.0 1.920000e+02 192.0 1.920000e+02 192.000000 192.000000 192.000000 192.0 192.000000 192.0 192.000000 192.000000 192.000000 192.000000 192.000000 192.000000
mean 770.562917 785.527549 754.949428 770.809322 770.809322 2.538587e+07 0.209910 0.766553 816.205489 32.716764 811.985427 721.842800 766.914113 13.673222 4.163695 32.844266 34.120150 11.499694 11.144082 0.355612 16.490469 1.597427 0.025935 55.885895 0.128767 52.965146 56.417876 56.597772 8.179319e+08 5.074038e+06 8.279505e+08 0.0 4.744614e+07 0.0 0.0 0.0 0.0 0.0 4.744614e+07 0.0 4.744614e+07 0.0 4.744614e+07 63.114583 4674.723958 4737.838542 0.0 4737.838542 0.0 4737.838542 4737.838542 0.209910 0.001669 0.027135 0.026729
std 170.456827 176.242155 163.122761 170.162721 170.162721 9.739109e+06 2.935941 0.357165 1408.852330 15.261383 193.398705 143.975228 166.811590 40.600020 1.334752 22.653615 25.457655 28.835383 27.082933 9.845492 89.727964 4.729407 0.107606 12.810017 0.327469 7.965855 23.807437 21.941664 9.839856e+07 1.330141e+07 2.325741e+08 0.0 3.400392e+07 0.0 0.0 0.0 0.0 0.0 3.400392e+07 0.0 3.400392e+07 0.0 3.400392e+07 39.247068 2210.142871 2245.294971 0.0 2245.294971 0.0 2245.294971 2245.294971 2.935941 0.029339 0.008446 0.008413
min 552.549988 566.210022 546.979980 563.460022 563.460022 9.800600e+06 -11.990296 -0.732332 8.112051 15.857966 594.664076 540.736287 578.178580 -59.704753 2.089077 9.169983 0.674235 -34.004668 -28.798312 -23.035439 -195.199951 -9.022349 -0.168725 30.137886 -0.272391 32.322336 7.978260 11.571510 6.785000e+08 -3.055088e+07 5.295056e+08 0.0 1.414173e+07 0.0 0.0 0.0 0.0 0.0 1.414173e+07 0.0 1.414173e+07 0.0 1.414173e+07 16.000000 2202.000000 2226.000000 0.0 2226.000000 0.0 2226.000000 2226.000000 -11.990296 -0.127723 0.014402 0.014402
25% 659.304993 666.410004 643.999985 656.250000 656.250000 1.850182e+07 -1.169610 0.527252 118.981362 21.287808 690.727727 626.965461 658.047141 -6.908829 3.064312 17.475006 15.442360 -6.910688 -6.338453 -3.397226 -28.335022 -0.787765 -0.040403 47.526313 -0.134923 46.635129 35.352979 38.382481 7.479462e+08 -3.536103e+06 6.383761e+08 0.0 2.757560e+07 0.0 0.0 0.0 0.0 0.0 2.757560e+07 0.0 2.757560e+07 0.0 2.757560e+07 40.000000 3179.750000 3222.250000 0.0 3222.250000 0.0 3222.250000 3222.250000 -1.169610 -0.011765 0.020978 0.020978
50% 710.839996 719.990021 701.989990 710.454987 710.454987 2.353565e+07 0.190414 0.737615 314.898087 25.122889 738.128392 670.284198 706.804278 10.612884 3.836345 27.290009 27.235595 11.245599 9.808429 1.213997 18.450012 1.573802 0.025185 54.030052 0.036283 52.901369 56.987522 57.054820 7.742272e+08 4.512362e+06 7.366648e+08 0.0 3.532263e+07 0.0 0.0 0.0 0.0 0.0 3.532263e+07 0.0 3.532263e+07 0.0 3.532263e+07 51.000000 4137.000000 4190.000000 0.0 4190.000000 0.0 4190.000000 4190.000000 0.190414 0.001902 0.024996 0.024343
75% 803.314987 813.092499 798.872513 807.059982 807.059982 3.054300e+07 1.840990 0.976373 729.995052 39.446491 807.736348 771.707313 789.859643 23.599107 5.139345 39.997482 48.490930 16.658031 16.146437 4.672527 52.612503 3.281958 0.079176 62.931757 0.267894 58.316329 78.418493 75.675742 8.586307e+08 1.230055e+07 9.989748e+08 0.0 5.338609e+07 0.0 0.0 0.0 0.0 0.0 5.338609e+07 0.0 5.338609e+07 0.0 5.338609e+07 74.000000 5444.000000 5513.500000 0.0 5513.500000 0.0 5513.500000 5513.500000 1.840990 0.018242 0.032566 0.030573
max 1234.410034 1243.489990 1217.000000 1229.910034 1229.910034 6.285210e+07 12.661595 1.893748 7315.693248 73.086370 1309.155080 1112.101288 1189.055699 159.212943 7.314463 162.979980 98.114812 114.203788 93.856278 30.888519 348.059998 16.619693 0.402010 94.198009 0.822076 77.940114 96.653419 93.889323 1.043720e+09 4.700579e+07 1.286465e+09 0.0 2.246558e+08 0.0 0.0 0.0 0.0 0.0 2.246558e+08 0.0 2.246558e+08 0.0 2.246558e+08 348.000000 21393.000000 21741.000000 0.0 21741.000000 0.0 21741.000000 21741.000000 12.661595 0.119218 0.047094 0.047094
In [2190]:
# DROPPING ALL NaN VALUES
# NOTE(review): duplicate of the earlier fillna/dropna cells — a safe no-op here.
df.dropna(inplace=True)
In [2191]:
# Plot realized current volatility against the future-window target series.
# NOTE(review): `n_future` and `INTERVAL_WINDOW` are defined in an earlier cell
# outside this view — confirm they are set before running this cell.
n_zoom = 365  # number of most-recent rows shown in the zoomed bottom panel
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: full history of both series.
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    # Bottom panel: only the last `n_zoom` rows.
    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [2192]:
# Daily volatility distribution with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it works on this
# pinned environment, but migrate to histplot/displot if seaborn is upgraded.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')
    
    plt.show();

Experiment 2: weekly granularity

In [2194]:
# Ticker selector for the weekly-granularity experiment.
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    """Echo the newly selected ticker whenever the dropdown value changes."""
    print("You have selected %s" % change['new'])

# Subscribing with names='value' fires the handler only on value changes, so
# the manual `change['type']`/`change['name']` filtering is no longer needed.
w.observe(on_change, names='value')

display(w)
You have selected TSLA
In [2195]:
# Load the pre-built feature CSV for the selected ticker.
# The 15 copy-pasted `if` blocks collapse into one parameterized read: every
# file follows the /content/Final_<TICKER>.csv naming scheme.  The 'SELECT'
# placeholder loads nothing, matching the original behavior exactly.
if w.value != 'SELECT':
    df = pd.read_csv('/content/Final_{}.csv'.format(w.value),
                     parse_dates=['Date'], index_col=['Date'])
In [2196]:
# Inspect the raw column set (note the exported 'Unnamed: 0' index column).
df.columns
Out[2196]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [2197]:
# Row/column count of the freshly loaded frame.
df.shape
Out[2197]:
(259, 52)
In [2198]:
# Missing-value counts per column (several indicators have leading-window NaNs).
df.isnull().sum()
Out[2198]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           4
NATR                          0
TRANGE                        0
DMI                           0
MACD                         12
MACDSIGNAL                   12
MACDHIST                     12
MOM                           0
PPO                           4
ROCP                          0
RSI                           0
TRIX                         67
ULTOSC                        7
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [2199]:
# Impute remaining NaNs with each column's median, drop the exported index
# column, and give the tweet-label column a clearer name.  Explicit assignments
# replace `del` and inplace=True so each step is visible and re-run safe.
df = df.fillna(df.median())
df = df.drop(columns=['Unnamed: 0'])
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [2200]:
# Collapse the daily rows into calendar-week means (requires the Date index).
df_weekly = df.resample('W').mean()
In [2201]:
# Weekly frame dimensions after resampling.
df_weekly.shape
Out[2201]:
(55, 51)
In [2202]:
# Full correlation heatmap at weekly granularity.
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[2202]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0770e35c10>
In [2203]:
# Shrink seaborn fonts for the weekly histograms below.
sns.set(font_scale=0.8)
In [2204]:
# Distribution of every weekly-aggregated column (50 bins each).
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [2205]:
# Weekly features whose absolute correlation with AvgTrueRange exceeds 0.5.
corr_with_target = df_weekly.corr()['AvgTrueRange']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
TRANGE          0.877333
NATR            0.821026
Upperband       0.723870
Middleband      0.670602
Variance        0.656320
High            0.644364
TRIX            0.632361
Open            0.632249
Adj Close       0.618176
Close           0.618176
Low             0.593488
Lowerband       0.584039
OBV             0.537728
Name: AvgTrueRange, dtype: float64
In [2206]:
# Weekly features whose absolute correlation with NATR exceeds 0.5.
corr_with_target = df_weekly.corr()['NATR']
golden_features_list = corr_with_target.loc[corr_with_target.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 4 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.821026
TRANGE          0.681933
Volume          0.610924
Name: NATR, dtype: float64
In [2207]:
# Features whose weekly correlation with TRANGE is stronger than |0.5|.
corr_series = df_weekly.corr()['TRANGE']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(strong), strong))
There are 20 strongly correlated values with TRANGE:
TRANGE                   1.000000
AvgTrueRange             0.877333
Variance                 0.773467
Upperband                0.712485
NATR                     0.681933
Verified_status_False    0.647620
Fake_news                0.646415
E                        0.646415
O                        0.646415
N                        0.646415
High                     0.644365
Middleband               0.642711
Open                     0.623003
Adj Close                0.610968
Close                    0.610968
TRIX                     0.591717
Low                      0.581036
Lowerband                0.535706
OBV                      0.527099
Verified_status_True     0.523892
Name: TRANGE, dtype: float64
In [2208]:
# Features whose weekly correlation with Openness (O) is stronger than |0.5|.
corr_series = df_weekly.corr()['O']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(strong), strong))
There are 13 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999978
Verified_status_True         0.921376
B5_N_Dm                      0.759633
B5_E_Dm                      0.759633
B5_O_Dm                      0.759633
Downward_momentum_created    0.759633
Variance                     0.731984
TRANGE                       0.646415
DMI                          0.605546
Name: O, dtype: float64
In [2209]:
# Features whose weekly correlation with Conscientiousness (C) is stronger than |0.5|.
corr_series = df_weekly.corr()['C']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: C, dtype: float64)
In [2210]:
# Features whose weekly correlation with Extraversion (E) is stronger than |0.5|.
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyses Extraversion, not conscientiousness.
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999978
Verified_status_True         0.921376
B5_N_Dm                      0.759633
B5_E_Dm                      0.759633
B5_O_Dm                      0.759633
Downward_momentum_created    0.759633
Variance                     0.731984
TRANGE                       0.646415
DMI                          0.605546
Name: E, dtype: float64
In [2211]:
# Features whose weekly correlation with Agreeableness (A) is stronger than |0.5|.
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyses Agreeableness, not conscientiousness.
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [2212]:
# Features whose weekly correlation with Neuroticism (N) is stronger than |0.5|.
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed copy-paste label: this cell analyses Neuroticism, not conscientiousness.
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999978
Verified_status_True         0.921376
B5_N_Dm                      0.759633
B5_E_Dm                      0.759633
B5_O_Dm                      0.759633
Downward_momentum_created    0.759633
Variance                     0.731984
TRANGE                       0.646415
DMI                          0.605546
Name: N, dtype: float64
In [2213]:
# Features whose weekly correlation with B5_O_Um is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_O_Um']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2214]:
# Features whose weekly correlation with B5_C_Um is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_C_Um']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2215]:
# Features whose weekly correlation with B5_E_Um is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_E_Um']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2216]:
# Features whose weekly correlation with B5_A_Um is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_A_Um']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2217]:
# Features whose weekly correlation with B5_N_Um is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_N_Um']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2218]:
# Features whose weekly correlation with B5_O_Dm is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_O_Dm']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(strong), strong))
There are 10 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.865871
Fake_news                    0.759633
N                            0.759633
E                            0.759633
O                            0.759633
Verified_status_False        0.756795
Name: B5_O_Dm, dtype: float64
In [2219]:
# Features whose weekly correlation with B5_C_Dm is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_C_Dm']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_C_Dm:
Series([], Name: B5_C_Dm, dtype: float64)
In [2220]:
# Features whose weekly correlation with B5_E_Dm is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_E_Dm']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(strong), strong))
There are 10 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.865871
Fake_news                    0.759633
N                            0.759633
E                            0.759633
O                            0.759633
Verified_status_False        0.756795
Name: B5_E_Dm, dtype: float64
In [2221]:
# Features whose weekly correlation with B5_A_Dm is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_A_Dm']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [2222]:
# Features whose weekly correlation with B5_N_Dm is stronger than |0.5|.
corr_series = df_weekly.corr()['B5_N_Dm']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(strong), strong))
There are 10 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.865871
Fake_news                    0.759633
N                            0.759633
E                            0.759633
O                            0.759633
Verified_status_False        0.756795
Name: B5_N_Dm, dtype: float64
In [2223]:
# Features whose weekly correlation with Fake_news is stronger than |0.5|.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
# Fixed stale label: the column was renamed from Real_or_Fake_tweet to Fake_news.
print("There are {} strongly correlated values with Fake_news:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
E                            1.000000
O                            1.000000
Verified_status_False        0.999978
Verified_status_True         0.921376
B5_N_Dm                      0.759633
B5_E_Dm                      0.759633
B5_O_Dm                      0.759633
Downward_momentum_created    0.759633
Variance                     0.731984
TRANGE                       0.646415
DMI                          0.605546
Name: Fake_news, dtype: float64
In [2224]:
# Features whose weekly correlation with Downward_momentum_created is stronger than |0.5|.
corr_series = df_weekly.corr()['Downward_momentum_created']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(strong), strong))
There are 10 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.865871
Fake_news                    0.759633
N                            0.759633
E                            0.759633
O                            0.759633
Verified_status_False        0.756795
Name: Downward_momentum_created, dtype: float64
In [2225]:
# Features whose weekly correlation with Upward_momentum_created is stronger than |0.5|.
corr_series = df_weekly.corr()['Upward_momentum_created']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(strong), strong))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2226]:
# Features whose weekly correlation with Verified_status_True is stronger than |0.5|.
corr_series = df_weekly.corr()['Verified_status_True']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(strong), strong))
There are 12 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.921376
N                            0.921376
E                            0.921376
O                            0.921376
Verified_status_False        0.918787
B5_N_Dm                      0.865871
B5_E_Dm                      0.865871
B5_O_Dm                      0.865871
Downward_momentum_created    0.865871
Variance                     0.583531
TRANGE                       0.523892
Name: Verified_status_True, dtype: float64
In [2227]:
# Features whose weekly correlation with Verified_status_False is stronger than |0.5|.
corr_series = df_weekly.corr()['Verified_status_False']
strong = corr_series[corr_series.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(strong), strong))
There are 13 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999978
N                            0.999978
E                            0.999978
O                            0.999978
Verified_status_True         0.918787
B5_N_Dm                      0.756795
B5_E_Dm                      0.756795
B5_O_Dm                      0.756795
Downward_momentum_created    0.756795
Variance                     0.733513
TRANGE                       0.647620
DMI                          0.606578
Name: Verified_status_False, dtype: float64
In [2228]:
sns.set(font_scale=0.8)
In [2229]:
# Scatter NATR against every feature, five features per pairplot figure.
for start in range(0, len(df_weekly.columns), 5):
    feature_chunk = df_weekly.columns[start:start + 5]
    sns.pairplot(data=df_weekly, x_vars=feature_chunk, y_vars=['NATR'])
In [2230]:
df_weekly.fillna(0, inplace = True)
In [2231]:
df_weekly.dropna(inplace=True)
In [2232]:
# Masked heatmap: only show coefficients >= 0.5 or <= -0.4 so the strong
# relationships stand out ('Close' is excluded from the matrix).
masked_corr = df_weekly.drop('Close', axis=1).corr()
strong_mask = (masked_corr >= 0.5) | (masked_corr <= -0.4)
plt.figure(figsize=(12, 10))
sns.heatmap(masked_corr[strong_mask],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [2233]:
# Weekly volatility (NATR) distribution, 50 bins, with a fitted normal
# curve overlaid for comparison.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.distplot(df_weekly.NATR, bins=50, norm_hist=True,
                 fit=stats.norm, ax=ax)
    ax.set_title('Weekly Volatility Distribution')
    plt.show();